Hypothesis: Media tends to bias reporting towards reader preferences
Data:
Methodology: To measure the political slant of German online newspapers, the topics addressed in newspapers are compared with the topics addressed in press releases of political parties…
# Reproducibility: fix the RNG seed (used by sample() further below)
set.seed(4556)
library(stm)
library(tidyverse)
library(dplyr)      # redundant: already attached by tidyverse, kept for explicitness
library(ggthemes)
library(tidytext)
library(patchwork)
# NOTE(review): clearing the workspace inside a script is discouraged;
# prefer running the script in a fresh R session instead.
rm(list = ls())
# Shared plot colors (slate grey / grey / black)
color1 <- "#778899"
color2 <- "#808080"
color3 <- "#000000"
# Load press releases (provides `pressReleases`)
load("../output/pressReleases.Rda")
# Load news article data (presumably provides `btw` used below -- TODO confirm)
load("../output/data_step2.Rda")
# Combine news articles and press releases into one modeling data frame:
# news rows get type = "news" and source = media outlet; press-release rows
# get source = party (their `type` is presumably already set to "press" in
# the loaded data -- TODO confirm). `doc_index` is a sequential row id;
# after bind_rows() row names are "1".."n", so as.numeric(rownames(.)) is 1..n.
model_df <- btw %>%
dplyr::mutate(date = as.Date(date),
type = "news",
source = medium) %>%
bind_rows(.,pressReleases %>%
mutate(source = party)) %>%
dplyr::mutate(doc_index = as.numeric(rownames(.)))
# Bar chart: number of documents per source, faceted by document type
# (news outlet vs. party), covering June 2017 - March 2018.
model_df %>%
  count(type, source) %>%
  ggplot(aes(reorder(source, n), n, fill = type)) +
  geom_col(show.legend = FALSE) +
  theme_hc() +
  coord_flip() +
  labs(
    y = "# documents", x = NULL,
    title = "Number of documents",
    subtitle = "June 2017 - March 2018"
  ) +
  facet_wrap(~type, scales = "free")
Before pre-processing:
Pre-processed Data:
The statistic tf-idf (term frequency - inverse document frequency) is intended to measure how important a word is to a document in a collection (or corpus) of documents.
The inverse document frequency for any given term is defined as
\[ idf\text{(term)}=\ln\left(\frac{n_{\text{documents}}}{n_{\text{documents containing term}}}\right) \]
To measure the tone (or sentiment) of an article a dictionary-based method is applied. To conduct such an analysis, a list of words (dictionary) associated with a given emotion, such as negativity is pre-defined. The document is then deconstructed into individual words and each word is assigned a sentiment value according to the dictionary, where the sum of all values results in the emotional score for the given document. Such lexical or “bag-of-words” approaches are widely presented in the finance literature to determine the effect of central banks’ monetary policy communications on asset prices and real variables.
The present paper uses a dictionary that lists words associated with positive and negative polarity weighted within the interval of \([-1; 1]\). SentimentWortschatz (SentiWS) is a publicly available German-language resource for sentiment analysis and opinion mining. The current version of SentiWS (v1.8b; note that the analysis code loads the v1.8c files) contains 1,650 positive and 1,818 negative words, which sum up to 15,649 positive and 15,632 negative words including their inflections, respectively.
The sentiment score for each document \(d\) is calculated based on the weighted polarity values for a word, defined on an interval between -1 and 1. The score is then calculated from the sum of the words in a document (which can be assigned to a word from the dictionary) divided by the total number of words in that document:
\[ \text{SentScore}_d = \frac{|\text{positive polarity score}_d| - |\text{negative polarity score}_d|}{|\text{TotalWords}_d|} \]
# Build the SentiWS sentiment dictionary: one row per (lower-cased) word
# with a polarity score in [-1, 1]; negative values = negative sentiment.
sent <- c(
# negative words (the original comment said "positive" -- it was swapped)
readLines("dict/SentiWS_v1.8c_Negative.txt",
encoding = "UTF-8"),
# positive words
readLines("dict/SentiWS_v1.8c_Positive.txt",
encoding = "UTF-8")
) %>%
lapply(function(x) {
# extract the tab-separated columns: "word|POS" \t polarity \t inflections
res <- strsplit(x, "\t", fixed = TRUE)[[1]]
return(data.frame(words = res[1], value = res[2],
stringsAsFactors = FALSE))
}) %>%
bind_rows %>%
# split "word|POS" into the lower-cased word and its POS tag
mutate(word = gsub("\\|.*", "", words) %>%
tolower, value = as.numeric(value),
type = gsub(".*\\|", "", words)) %>%
# restrict to adjectives or adverbs only (currently disabled)
# filter(type == "ADJX" | type == "ADV") %>%
# some words occur more than once; take the mean polarity value
group_by(word) %>%
dplyr::summarise(polarity = mean(value)) %>% ungroup %>%
# drop "Heil" (false positive caused by the politician Hubertus Heil)
filter(!grepl('heil',word,ignore.case = T)) %>%
# drop "hell" ("welcome to hell" G20 protest slogan)
filter(!grepl('hell',word,ignore.case = T)) %>%
# drop near-neutral words scoring between -0.1 and +0.1
filter(!dplyr::between(polarity, -0.1,0.1))
# Tokenize each document's title+text into words and attach dictionary
# polarity; words without a dictionary match get NA polarity.
sent_token <- model_df %>%
select(doc_index, date, source, title_text) %>%
unnest_tokens(word, title_text) %>% left_join(., sent, by = "word")
# Per-document sentiment: sum of matched polarity scores divided by the
# TOTAL token count n (including unmatched words), per the SentScore formula.
sent_df <- sent_token %>%
group_by(doc_index) %>%
dplyr::summarise(n = n(),
polarity_sum = sum(polarity, na.rm = T),
sentiment = polarity_sum/n)
# Persist for reuse by the topic-model section below
save(sent_df, file="../output/sentiment.Rda")
# Attach document-level sentiment back onto the modeling data frame
model_df <- left_join(model_df, sent_df, by="doc_index")
# Median sentiment per source (scaled x1000 for readability), plus a dashed
# reference line at the per-type (news vs. press) median sentiment.
# Fixes: the intermediate was named `plot`, masking base::plot() (it is
# reassigned later in the file before any further use, so renaming is safe
# here); T/F literals replaced with TRUE/FALSE.
sent_by_source <- model_df %>%
  mutate(sentiment = sentiment * 1000) %>%
  # per-type median first, so each facet gets one reference value
  group_by(type) %>%
  mutate(avg_sent_type = median(sentiment, na.rm = TRUE)) %>%
  ungroup() %>%
  group_by(source, type) %>%
  dplyr::summarise(avg_sent_type = mean(avg_sent_type, na.rm = TRUE),
                   sent_mean = median(sentiment, na.rm = TRUE)) %>%
  ungroup()
ggplot(sent_by_source, aes(source, sent_mean,
                           label = round(sent_mean, 2),
                           color = type)) +
  geom_point(show.legend = FALSE, alpha = 0.8, size = 3) +
  geom_hline(aes(yintercept = avg_sent_type), linetype = 2, color = "darkred") +
  geom_hline(aes(yintercept = 0), color = color1) +
  geom_text(show.legend = FALSE, color = "black", size = 2, vjust = -1) +
  coord_flip() +
  facet_grid(type ~ ., scales = "free_y") +
  labs(x = NULL, y = NULL, title = "Sentiment score") +
  theme_hc()
#ggsave("../figs/sentiment.png", height = 4, width = 6, dpi=300)
Parties want the media agenda to be congruent with their own agenda to define the issue-based criteria on which they will be evaluated by voters ( Eberl, 2017 ). Thus, parties instrumentalize their press releases in order to highlight issues that they are perceived to be competent on, that they “own” and that are important to their voters ( Kepplinger & Maurer, 2004 ). Editors can select from this universe and decide which of these topics will be discussed in the news. In that sense the ideological content of a newspaper refers to the extent to which the topics promoted by the parties correlate with the topics discussed in the news articles.
To discover the latent topics in the corpus of press releases and news articles, structural topic modeling (STM), developed by Roberts (2016), is applied. The STM is an unsupervised machine learning approach that models topics as multinomial distributions of words and documents as multinomial distributions of topics, allowing the incorporation of external variables that affect both topical content and topical prevalence.
STM assumes a fixed user-specified number of topics. There is not a “right” answer to the number of topics that are appropriate for a given corpus (Grimmer and Stewart 2013). Roberts et al. (2016) propose to measure topic quality through a combination of semantic coherence and exclusivity of words to topics. Semantic coherence is a criterion developed by Mimno et al. (2011) and is closely related to pointwise mutual information (Newman et al. 2010): it is maximized when the most probable words in a given topic frequently co-occur together.
Using the function searchK several automated tests are performed to help choose the number of topics including the average exclusivity and semantic coherence as well as the held out likelihood (Wallach et al. 2009) and the residuals (Taddy 2012).
This process revealed that a model with 50 topics best reflects the structure in the corpus. Furthermore, the source of a document (each party as well as each media outlet represent one source) is used as covariate in the topic prevalence. In other words, I assume that the probability distribution of topics for a specific document is influenced by the source of that document. Additionally the type of that source (news website or party) is used as a covariate for the term frequency as I assume that the words used for the same topic differ between news articles or press releases.
# --- Part 2: topic-model post-processing (written as a standalone script) ---
library(stm)
library(tidyverse)
library(tidytext)
library(ggthemes)
library(xtable)
# NOTE(review): clearing the workspace mid-file assumes this section runs as
# an independent script; everything it needs is re-loaded below.
rm(list = ls())
# plot colors
color <- "#b7b7b7"
color1 <- "#778899"
color2 <- "#808080"
color3 <- "#000000"
# project helpers (e.g. normalize_data2 used further below)
source("func/functions.R")
# Fitted STM results (provides `stmOut`, and presumably `model_df` used
# below -- TODO confirm object names in the .RDa)
load("../output/models/finalmodel_50.RDa")
# Document-level sentiment scores (provides `sent_df`, saved earlier)
load("../output/sentiment.Rda")
# number of topics in the fitted model (50)
k <- stmOut$settings$dim$K
# re-create the sequential document index and attach sentiment scores
model_df <- model_df %>%
dplyr::mutate(doc_index = as.numeric(rownames(.))) %>%
left_join(., sent_df, by="doc_index")
# Echo the model call and vocabulary size (output preserved below)
stmOut$settings$call
## stm(documents = out$documents, vocab = out$vocab, K = k, prevalence = ~source,
## content = ~type, data = out$meta, init.type = "Spectral",
## max.em.its = 75)
length(stmOut$vocab)
## [1] 74343
To explore the words associated with each topic we use the words with the highest probability in each topic. As we included the source type (press release or newspaper) as a control for the topical content (the word distribution of each topic), we have two different labels for each topic.
# Most probable words per topic, separately for each level of the content
# covariate (news articles vs. press releases), via stm::sageLabels.
sagelabs <- sageLabels(stmOut, 20)
# Top-3 highest-probability words per topic under the "news" content level
newsLabels <- as.data.frame(sagelabs$cov.betas[[1]]$problabels) %>%
transmute(topic = as.numeric(rownames(.)),
topic_name_news = paste(V1,V2,V3))
# ... and under the "press release" content level
pressLabels <- as.data.frame(sagelabs$cov.betas[[2]]$problabels) %>%
transmute(topic = as.numeric(rownames(.)),
topic_name_press = paste(V1,V2,V3))
# One row per topic with both label variants side by side
topics.df <- left_join(newsLabels, pressLabels, by="topic") %>%
mutate(label1 = paste(topic_name_news, topic_name_press))
# Joint label: concatenation of both label sets with duplicate words removed.
# seq_len(k) replaces seq(k): identical for k >= 1 and safe for k == 0.
for (i in seq_len(k)) {
label <- paste(unique(unlist(strsplit(topics.df$label1[i], " "))), collapse = " ")
topics.df$joint_label[i] <- paste("Topic",topics.df$topic[i],":", label)
}
# Render the label table (echoed as markdown below)
topics.df %>%
select(joint_label, topic_name_news, topic_name_press) %>%
htmlTable::htmlTable(align="l", header = c("Joint label", "News articles","Press releases"),
rnames = FALSE)
| Joint label | News articles | Press releases |
|---|---|---|
| Topic 1 : schulz spd martin sichert | schulz spd martin | schulz martin sichert |
| Topic 2 : wahlkampf duell tv kanzlerin merkel wähler | wahlkampf duell tv | kanzlerin merkel wähler |
| Topic 3 : grünen cdu niedersachsen schwarz jamaika | grünen cdu niedersachsen | grünen schwarz jamaika |
| Topic 4 : spd union groko koalit koalitionsvertrag großen | spd union groko | koalit koalitionsvertrag großen |
| Topic 5 : the of to start up | the of to | start up the |
| Topic 6 : diesel auto fahrverbot bundesregierung | diesel auto fahrverbot | diesel bundesregierung auto |
| Topic 7 : twitter facebook netz medien meinungsfreiheit | twitter facebook netz | medien facebook meinungsfreiheit |
| Topic 8 : seehof csu söder sozial bayern gerechtigkeit | seehof csu söder | sozial bayern gerechtigkeit |
| Topic 9 : merkel angela kanzlerin bundeskanzlerin | merkel angela kanzlerin | merkel angela bundeskanzlerin |
| Topic 10 : trump russland schröder sanktionen nato | trump russland schröder | russland sanktionen nato |
| Topic 11 : koalit spd neuwahlen digitalisierung deutschland bildung | koalit spd neuwahlen | digitalisierung deutschland bildung |
| Topic 12 : afghanistan abschiebung abschiebungen bundesregierung jelpk | afghanistan abschiebung abschiebungen | afghanistan bundesregierung jelpk |
| Topic 13 : stadt flüchtling innenminist flüchtlingen grenzen | stadt flüchtling innenminist | flüchtling flüchtlingen grenzen |
| Topic 14 : prozent spd afd theurer jahr | prozent spd afd | prozent theurer jahr |
| Topic 15 : israel antisemitismu juden dr europarat demokrati | israel antisemitismu juden | dr europarat demokrati |
| Topic 16 : nordrhein westfalen nrw regionen osten ländlichen | nordrhein westfalen nrw | regionen osten ländlichen |
| Topic 17 : euro milliarden geld rent | euro milliarden geld | euro milliarden rent |
| Topic 18 : spd verhandlungen union zentral wichtig | spd verhandlungen union | zentral wichtig verhandlungen |
| Topic 19 : afd gauland weidel alic | afd gauland weidel | weidel alic gauland |
| Topic 20 : berlin polizei amri air bundesregierung | berlin polizei amri | berlin air bundesregierung |
| Topic 21 : maa steinmeier gesetz bundestag gesetzentwurf | maa steinmeier gesetz | gesetz bundestag gesetzentwurf |
| Topic 22 : kohl helmut kanzler europa macron deutsch | kohl helmut kanzler | europa macron deutsch |
| Topic 23 : fdp grünen jamaika frage lindner | fdp grünen jamaika | frage lindner grünen |
| Topic 24 : bundeswehr soldaten leyen bundesregierung nato | bundeswehr soldaten leyen | bundeswehr bundesregierung nato |
| Topic 25 : frauen kinder männer gewalt opfer | frauen kinder männer | frauen gewalt opfer |
| Topic 26 : csu cdu union sude obergrenz seehof | csu cdu union | sude obergrenz seehof |
| Topic 27 : hamburg gipfel polizei trump usa | hamburg gipfel polizei | trump usa gipfel |
| Topic 28 : afd petri partei poggenburg sachsen andré | afd petri partei | poggenburg sachsen andré |
| Topic 29 : verfassungsschutz deutschland bka erdogan bundesregierung | verfassungsschutz deutschland bka | erdogan deutschland bundesregierung |
| Topic 30 : daten informationen unternehmen banken de ezb | daten informationen unternehmen | banken de ezb |
| Topic 31 : raf berlin schleyer verbrauch behörden daten | raf berlin schleyer | verbrauch behörden daten |
| Topic 32 : flüchtling jahr zahl kommunen mietpreisbrems wohnungsbau | flüchtling jahr zahl | kommunen mietpreisbrems wohnungsbau |
| Topic 33 : welt menschen politik menschenrecht deutschland | welt menschen politik | menschen menschenrecht deutschland |
| Topic 34 : bundestag afd fraktion antrag deutschen | bundestag afd fraktion | bundestag antrag deutschen |
| Topic 35 : muslim islam kirch kind herdt deutschland | muslim islam kirch | kind herdt deutschland |
| Topic 36 : prozess gericht bundesanwaltschaft türkei menschenrecht türkischen | prozess gericht bundesanwaltschaft | türkei menschenrecht türkischen |
| Topic 37 : link linken partei gabriel außenminist politik | link linken partei | gabriel außenminist politik |
| Topic 38 : zdf sendung talk öffentlich rechtlichen rundfunk | zdf sendung talk | öffentlich rechtlichen rundfunk |
| Topic 39 : türkei erdogan deutschland eu lambsdorff europäischen | türkei erdogan deutschland | eu lambsdorff europäischen |
| Topic 40 : höcke afd npd georg pazderski brandner | höcke afd npd | georg pazderski brandner |
| Topic 41 : cdu spahn politik tauber heiner | cdu spahn politik | tauber heiner politik |
| Topic 42 : deutschland europa macron klimaschutz landwirtschaft | deutschland europa macron | deutschland klimaschutz landwirtschaft |
| Topic 43 : eu deutschland europa | eu deutschland europa | eu deutschland europa |
| Topic 44 : familiennachzug flüchtling deutschland migranten | familiennachzug flüchtling deutschland | deutschland familiennachzug migranten |
| Topic 45 : cdu spd twesten de bundestag maizièr | cdu spd twesten | de bundestag maizièr |
| Topic 46 : berlin jahr tag opfer freiheit | berlin jahr tag | opfer tag freiheit |
| Topic 47 : august cdu spd wahl hondura wähler | august cdu spd | wahl hondura wähler |
| Topic 48 : ge ten be nen re | ge ten be | ge nen re |
| Topic 49 : kinder deutschland studi bildung | kinder deutschland studi | kinder bildung deutschland |
| Topic 50 : terror köln ring endlich muslim terrorismu | terror köln ring | endlich muslim terrorismu |
# Long-format document-topic probabilities: one row per document x topic,
# joined with topic labels and document metadata/sentiment.
theta <- as.data.frame(stmOut$theta) %>% # get all theta values for each document
mutate(doc_index = as.numeric(rownames(.))) %>%
# convert to long format: columns V1..Vk become (topic, theta) pairs
gather(topic, theta, -doc_index) %>%
mutate(topic = as.numeric(gsub("V","",topic))) %>%
# join with topic labels
left_join(., topics.df, by="topic") %>%
# join with document metadata and sentiment
left_join(., model_df %>%
select(date,type,source,doc_index,title_text,sentiment), by="doc_index") %>%
# drop documents published in May (per the original comment, May 2017;
# note the filter removes month 5 of ANY year)
mutate(
year = lubridate::year(date),
month = lubridate::month(date)
) %>%
filter(month != 5)
Each document has a probability distribution over all topics, e.g.
# Illustrate the topic distribution of one randomly selected document.
# NOTE(review): the variable name `sample` masks base::sample(); it is read
# again below (sample$title_text), so it is left unchanged here -- consider
# renaming both uses together.
doc <- sample(unique(theta$doc_index),1)
sample <- theta %>% filter(doc_index == doc)
# title and source of the sampled document, used for the plot caption
caption <- model_df %>% filter(doc_index == doc) %>% select(title, source)
# Horizontal bars: theta (topic probability) for each of the k topics
sample %>%
ggplot(aes(reorder(joint_label,desc(topic)), theta)) +
geom_col(fill = color1) +
coord_flip() +
ylim(c(0,1)) +
theme_hc() +
labs(x = NULL, y = NULL, caption = paste("title:",caption$title,"(",caption$source,")"))
#ggsave("../figs/doc_topic_distr.png", height = 8, width = 8)
What is the document about?
# Echo the raw text of the sampled document (output preserved below)
sample$title_text[1]
## [1] "Bundestagsbericht - Geheimdienste überwachten 3747 Telefonanschlüsse Die deutschen Geheimdienste haben im vergangenen Jahr deutlich mehr Arbeit geleistet als 2015 - jedenfalls was Überwachung im Bereich der Telekommunikation angeht.\n\n► Insgesamt wurden 3747 Telekommunikations-Anschlüsse überwacht, das sind 32 Prozent mehr als im Vorjahr (2838)!\n\nLaut Bericht des Parlamentarischen Kontrollgremiums des Bundestages (PKG) waren von den Abhörmaßnahmen 261 Personen betroffen. 2015 waren es 193 (plus 35 Prozent). Die meisten Maßnahmen wurden vom Bundesamt für Verfassungsschutz angeordnet.\n\nIm ersten Halbjahr wurden 432 Hinweise des Verfassungsschutzes aus Abhörmaßnahmen an 33 ausländische Dienste weitergegeben. Im zweiten Halbjahr gingen 542 Meldungen an 39 ausländische Dienste.\n\n* Der Bundesnachrichtendienst durchforstete Telekommunikations-Verkehr (z. B. E-Mails) mithilfe von 2307 Suchbegriffen auf \"internationalem Terrorismus\". 34 Fälle waren \"nachrichtendienstlich relevant\".\n\n* Im Bereich illegaler Rüstungsexporte wurden 379 Suchbegriffe verwendet und 19 Hinweise entdeckt."
The figure below displays the topics ordered by their expected frequency across the corpus.
# Expected topic proportion across the whole corpus, most frequent first.
topicmean <- theta %>%
  group_by(topic, joint_label) %>%
  summarise(frequency = mean(theta)) %>%
  ungroup() %>%
  arrange(-frequency) %>%
  mutate(order = row_number())

# Horizontal bar chart of the expected topic proportions.
topicmean %>%
  ggplot(aes(reorder(joint_label, -order), frequency, fill = frequency)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  theme_hc() +
  labs(x = NULL, y = NULL)
#ggsave("../figs/topic_proportion.png", height = 8, width = 8)
For each source the average distribution of each topic is calculated.
# Mean topic share (theta) per source, for one document type.
mean_topic_share <- function(df, doc_type) {
  df %>%
    filter(type == doc_type) %>%
    group_by(topic, joint_label, source) %>%
    summarise(frequency = mean(theta, na.rm = TRUE)) %>%
    ungroup()
}

topicmean_news  <- mean_topic_share(theta, "news")
topicmean_press <- mean_topic_share(theta, "press")
# Per-outlet topic distributions, one facet per news source.
ggplot(topicmean_news,
       aes(reorder(joint_label, desc(topic)), frequency, fill = frequency)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  theme_hc() +
  facet_grid(~source) +
  scale_y_continuous(limits = c(0, 0.25), breaks = c(0, 0.1, 0.2)) +
  labs(x = NULL, y = NULL)
#theme(axis.text.y = element_blank())
#ggsave("../figs/topic_proportion_news.png", width = 10, height =8)
# Per-party topic distributions, one facet per party.
ggplot(topicmean_press,
       aes(reorder(joint_label, desc(topic)), frequency, fill = frequency)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  theme_hc() +
  facet_grid(~source) +
  scale_y_continuous(limits = c(0, 0.25), breaks = c(0, 0.1, 0.2)) +
  labs(x = NULL, y = NULL)
#ggsave("../figs/topic_proportion_press.png", width = 10, height =8)
To combine the sentiment value with the topic probability of each document, the sentiment score is multiplied with the \(k\)x\(1\) vector for each document. Subsequently, to calculate the average topic sentiment for a source \(s\), the mean value of all documents belonging to source \(s\) is calculated. This results in a \(k\)x\(1\) vector representing the mean distributions \(\bar{ \theta_{s} }\) of this source, weighted by the sentiment scores:
\[ \bar{ \theta_{s} } = \begin{bmatrix} \bar{ \theta_{1} } \\ . \\ . \\ \bar{ \theta_{k} } \\ \end{bmatrix} \]
# Sentiment-weighted topic share per document: theta scaled by the
# document's sentiment score (NA sentiment yields NA, dropped in the means).
theta <- theta %>% mutate(sentiment_theta = sentiment * theta)
# Mean sentiment-weighted topic share per source, by document type.
# Fixes: na.rm = T -> TRUE; both summarise() calls now use the same
# explicit dplyr:: namespace (the press branch previously did not).
topicsent_news <- theta %>%
  filter(type == "news") %>%
  group_by(topic, joint_label, source) %>%
  dplyr::summarise(sent_theta = mean(sentiment_theta, na.rm = TRUE)) %>%
  ungroup()
topicsent_press <- theta %>%
  filter(type == "press") %>%
  group_by(topic, joint_label, source) %>%
  dplyr::summarise(sent_theta = mean(sentiment_theta, na.rm = TRUE)) %>%
  ungroup()
# News outlets: sentiment-weighted topic shares (scaled x1000).
topicsent_news %>%
  mutate(sent_theta = sent_theta * 1000) %>%
  ggplot(aes(reorder(joint_label, desc(topic)), sent_theta,
             fill = sent_theta)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  theme_hc() +
  facet_grid(. ~ source) +
  scale_y_continuous(limits = c(-0.7, 0.2), breaks = c(-0.6, -0.4, -0.2, 0)) +
  labs(x = NULL, y = NULL) +
  theme(axis.text.x = element_text(size = 6)
  #axis.text.y = element_blank()
  )
#ggsave("../figs/topic_sent_news.png", width = 10, height =8)
# Parties: sentiment-weighted topic shares (scaled x1000).
topicsent_press %>%
  mutate(sent_theta = sent_theta * 1000) %>%
  ggplot(aes(reorder(joint_label, desc(topic)), sent_theta,
             fill = sent_theta)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  theme_hc() +
  facet_grid(~source) +
  scale_y_continuous(limits = c(-0.7, 0.2), breaks = c(-0.6, -0.4, -0.2, 0)) +
  labs(x = NULL, y = NULL) +
  theme(axis.text.x = element_text(size = 6))
#ggsave("../figs/topic_sent_press.png", width = 10, height =8)
On the basis of these weighted topic distributions, the bivariate Pearson correlation coefficients are calculated for each pair of media outlet and party. The higher the correlation coefficient, the higher the slant index for a party in a media outlet.
# Correlate the sentiment-weighted topic profiles of all sources.
library(Hmisc)
library(ggcorrplot)
library(corrr)
# canonical source names, used to classify correlation pairs below
news <- c("DIE WELT","stern.de", "ZEIT ONLINE", "FOCUS Online", "Bild.de", "SPIEGEL ONLINE", "tagesschau.de" )
parties <- c("CDU", "SPD", "AfD", "B90/GRÜNE", "DIE LINKE", "FDP" )
# wide form: one row per topic, one column per source
corr.df <- bind_rows(topicsent_press, topicsent_news) %>%
select(-joint_label) %>% spread(source, sent_theta)
# drop the `topic` id column, keep only the source columns
x <- as.matrix(corr.df[,-1])
# pairwise Pearson correlations between all source columns
rs <- correlate(x)
rs %>%
network_plot()
library(patchwork)
# Long-format correlation pairs with the source type (news/party) attached.
# NOTE(review): corrr::correlate() calls its first column "rowname" in older
# versions and "term" in newer ones -- the rename below assumes "rowname";
# verify against the installed corrr version. Also, `plot` masks
# base::plot(); kept because the three heatmap chunks below read it.
plot <- rs %>%
as_tibble() %>%
dplyr::rename(source1 = rowname) %>%
gather(key = source2, value = corr, -source1) %>%
mutate(
type1 = ifelse(source1 %in% news, "news", "party"),
type2 = ifelse(source2 %in% news, "news", "party")
)
# Heatmap: outlet-vs-outlet correlation of topic profiles.
p1 <- plot %>%
  filter(type1 == "news", type2 == "news") %>%
  ggplot(aes(source1, source2,
             fill = corr,
             label = round(corr, digits = 2))) +
  geom_tile() +
  scale_fill_gradient2(low = color, high = color1) +
  geom_text(size = 1.8) +
  theme_hc() +
  labs(x = NULL, y = NULL, title = "Medium / Medium") +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  theme(legend.position = "none",
        axis.text = element_text(size = 6),
        axis.text.x = element_text(angle = 90))
# Heatmap: outlet-vs-party correlation of topic profiles.
p2 <- plot %>%
  filter(type1 == "news", type2 == "party") %>%
  ggplot(aes(source1, source2,
             fill = corr,
             label = round(corr, digits = 2))) +
  geom_tile() +
  scale_fill_gradient2(low = color, high = color1) +
  geom_text(size = 1.8) +
  theme_hc() +
  labs(x = NULL, y = NULL, title = "Medium / Partei") +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  theme(legend.position = "none",
        axis.text = element_text(size = 6),
        #axis.text.y = element_blank(),
        axis.text.x = element_text(angle = 90))
# Heatmap: party-vs-party correlation of topic profiles.
p3 <- plot %>%
  filter(type1 == "party", type2 == "party") %>%
  ggplot(aes(source1, source2,
             fill = corr,
             label = round(corr, digits = 2))) +
  geom_tile() +
  scale_fill_gradient2(low = color, high = color1) +
  geom_text(size = 1.8) +
  theme_hc() +
  labs(x = NULL, y = NULL, title = "Partei / Partei") +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  theme(legend.position = "none",
        axis.text = element_text(size = 6),
        axis.text.x = element_text(angle = 90))

# Arrange the three panels side by side (patchwork).
p1 + p2 + p3
#ggsave("../figs/corrplot.png", width = 10, height =4)
# Party-by-outlet correlations in wide form: one row per outlet (source2),
# one column per party -- input for the radar chart.
radar <- plot %>%
filter(type1 == "party") %>%
filter(type2=="news") %>%
select(source1, source2, corr) %>%
spread(key = source1, value = corr)
# Interactive radar chart of each outlet's party-correlation profile
ggiraphExtra::ggRadar(radar, aes(color = source2),
rescale = F,
interactive = T,
alpha = 0)
# Print out the dataframe to a latex table.
# digits has one entry for the row names plus one per column of `radar`
# (its length must equal ncol(radar) + 1); digits are ignored for the
# character outlet column.
xt <- xtable(radar,
digits = c(0,3,3,3,3,3,3,3),
type="latex",
caption ="Topic correlation")
print(xt, include.rownames = F, file="../writing/tables/correlation.tex" )
Rescale the values
# Rescale each party column to a common range before redrawing the radar.
# normalize_data2() comes from func/functions.R -- its exact scaling is not
# visible here; TODO confirm (presumably min-max normalization).
radar_rescaled <- as_tibble(sapply(radar[,-1], function(x) normalize_data2(x) ) ) %>%
mutate(medium = radar$source2)
ggiraphExtra::ggRadar(radar_rescaled,
aes(color = medium),
rescale = F,
interactive = T,
alpha = 0)
Since 2012, the Reuters Institute Digital News Survey has been investigating the media use of digital content. Among others, the following questions are investigated: What websites do you visit to access news online? What is your political orientation?
The study is being conducted simultaneously in 37 countries under the coordination of the Reuters Institute for the Study of Journalism, based in Oxford (UK). The Hans Bredow Institute has been responsible for the German part of the study as a cooperation partner since 2013. Fieldwork was undertaken between 19th - 22nd January 2018 conducting an online survey. Total sample size for Germany was 2038 adults (aged 18+) who access news once a month or more.
# Reuters Digital News Survey: media-usage sheet (default) and the
# political-orientation sheet of the same workbook
reutersDF1 <- readxl::read_excel("../data/reuters_clean.xlsx")
reutersDF2 <- readxl::read_excel("../data/reuters_clean.xlsx", sheet = "orientation")
Q5b. Which of the following brands have you used to access news online in the last week (via websites, apps, social media, and other forms of Internet access)? Please select all that apply.
Which of the brands have you used to access news online in the last week?
Q1F. Some people talk about ‘left’, ‘right’ and ‘centre’ to describe parties and politicians. (Generally socialist parties would be considered ‘left wing’ whilst conservative parties would be considered ‘right wing’). With this in mind, where would you place yourself on the following scale?
# Heatmap: relative media use (in %) by self-reported political orientation.
# NOTE(review): `reutersDF.long` and `keeps` are not defined anywhere in this
# file -- presumably built in a chunk not shown; verify before running.
p1 <- reutersDF.long %>%
filter(!grepl("know",medium)) %>%
filter(!grepl("None",medium)) %>%
filter(medium %in% keeps) %>%
mutate(
label = (count_relative_m*100),
# label_color is computed but currently unused (matching scale commented out)
label_color = ifelse(label > 18, "white", "black")
) %>%
ggplot(aes(reorder(partisan.f, partisan_scale),
reorder(medium, order_relative_m),
fill = count_relative_m)) +
geom_tile() +
scale_fill_gradient2(low = color, high = color1) +
geom_text(aes(label=round(label, digits = 1)), color = color3, size = 2) +
#scale_color_manual(values = c("black"=color3, "white"="white")) +
theme_hc() +
labs(x = NULL, y = NULL) +
scale_x_discrete(expand = c(0, 0)) +
scale_y_discrete(expand = c(0, 0)) +
theme(legend.position = "none",
axis.text = element_text(size = 6),
axis.text.x = element_text(angle = 90))
# Companion bar chart: total mentions per medium, with the same row
# ordering as the heatmap (via order_relative_m).
p2 <- reutersDF.long %>%
filter(!grepl("know",medium)) %>%
filter(medium %in% keeps) %>%
group_by(medium) %>%
summarise(count_sum = sum(count),
order_relative_m = sum(order_relative_m)) %>%
ggplot(aes(reorder(medium,order_relative_m), count_sum)) +
geom_col(fill = color1) +
geom_text(aes(label=count_sum), hjust = 1, size = 2, color = "white") +
coord_flip() +
theme_hc() +
labs(x = NULL, y = NULL) +
theme(axis.text.y = element_blank(),
axis.text = element_text(size = 6),
axis.ticks.y = element_blank()
)
# Heatmap + bar chart side by side (heatmap twice as wide)
p1 + p2 + plot_layout(widths = c(2,1))
#ggsave("../figs/reuters3.png", width = 5, height = 3)
# calculate topic mean by source and month
# Monthly mean of sentiment-weighted topic shares, wide by source:
# one row per (topic, month, year), one column per source.
topicmean_monthly <- theta %>%
group_by(topic,source, month, year) %>%
dplyr::summarise(topicmean = mean(sentiment_theta)) %>%
ungroup() %>%
spread(source, topicmean)
# Source lists for the correlation loop below.
# NOTE(review): `parties` was a plain character vector earlier in the file;
# here it is rebound to a one-column data frame (hence `parties$source`
# below) -- intentional but easy to confuse.
media <- unique(model_df %>% filter(type == "news") %>% select(source))
parties <- unique(model_df %>% filter(type == "press") %>% select(source))
# Monthly media-party agenda correlation: for every party, correlate its
# monthly topic profile with each outlet's profile. One row per
# (month, year, medium, party).
# Replaces the original rm()/exists()/rbind() pattern, which grew corrDF
# row-block by row-block inside a for loop, with a single map_dfr() pass
# (purrr is attached via tidyverse). Also matches the "Cor." prefix
# literally via fixed = TRUE instead of as a regex where "." matched any
# character.
corrDF <- purrr::map_dfr(parties$source, function(p) {
  topicmean_monthly %>%
    group_by(month, year) %>%
    # correlate every outlet column with party p's column, per month
    do(data.frame(Cor=t(cor(.[,media$source], .[,p])))) %>%
    # columns 3:9 are the seven outlet correlation columns
    gather(medium, cor, 3:9) %>%
    mutate(party = p,
           # strip the "Cor." prefix that data.frame() added
           medium = gsub("Cor.", "", medium, fixed = TRUE)) %>%
    ungroup()
})
# Agenda-correlation table with a proper monthly date, a normalized
# correlation (normalize_data2 comes from func/functions.R), and display
# names restored (spread() had turned spaces into dots).
agenda <- corrDF %>%
  mutate(
    date = as.Date(paste0(year, "/", month, "/1")),
    cor_norm = normalize_data2(cor),
    medium = dplyr::recode(
      medium,
      "DIE.WELT"       = "DIE WELT",
      "ZEIT.ONLINE"    = "ZEIT ONLINE",
      "FOCUS.Online"   = "FOCUS Online",
      "SPIEGEL.ONLINE" = "SPIEGEL ONLINE"
    )
  )
# Time series of the monthly media-party agenda correlation, one facet per
# party, one line per outlet.
p <- agenda %>%
ggplot(aes(date, cor,
color = medium,
linetype = medium)) +
geom_line() +
geom_hline(yintercept = 0, size = 0.3, color = color1) +
facet_wrap(~party) +
theme_hc() +
scale_color_viridis_d() +
labs(y=NULL, x =NULL) +
scale_x_date(date_breaks = "2 month", date_labels = "%b/%y") +
theme(legend.position = "bottom",
legend.title = element_blank()) +
guides(col = guide_legend(nrow = 1))
# interactive version (tooltips show the correlation and the outlet) ...
plotly::ggplotly(p, tooltip=c("cor","medium"))
# ... and the static version
p
#ggsave("../figs/corr_timeline.png", height = 5, width = 8)